データ可視化〜まだExcelで疲弊してるの?

今回だけでggplot2を習得するのは難しいので本当に紹介程度です。 コピペでも全然OKです。 岩嵜先生のとても参考になる資料がよくまとまっています。 資料1 資料2

資料はネット上に日本語、英語問わずたくさんある。

データ可視化は重要。 Rは可視化に強い。特に人気の可視化パッケージであるggplot2を中心に学ぶ。
Pythonだとmatplotlibやseabornが人気のよう。最近のいけてる論文のfigは結構ggplot2で作成されている。 ggplot2の説明は難しいのでとにかくコードを実行してたくさん図を作っていきましょう。
イメージはレイヤを重ねていく感じ

基本
ggplot() # 使うデータを指定
aes() # x軸、y軸など指定
geom_() # 棒グラフか、散布図かヒストグラムなのかなど
theme_() # 背景や軸の見栄え

練習

library(ggplot2)
head(diamonds) # ダイアモンドのデータ
## # A tibble: 6 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
head(mpg) # 自動車の燃費のデータ
## # A tibble: 6 x 11
##   manufacturer model displ  year   cyl trans  drv     cty   hwy fl    class
##   <chr>        <chr> <dbl> <int> <int> <chr>  <chr> <int> <int> <chr> <chr>
## 1 audi         a4      1.8  1999     4 auto(… f        18    29 p     comp…
## 2 audi         a4      1.8  1999     4 manua… f        21    29 p     comp…
## 3 audi         a4      2    2008     4 manua… f        20    31 p     comp…
## 4 audi         a4      2    2008     4 auto(… f        21    30 p     comp…
## 5 audi         a4      2.8  1999     6 auto(… f        16    26 p     comp…
## 6 audi         a4      2.8  1999     6 manua… f        18    26 p     comp…
# ?mpgでこのデータセットの詳細がわかる
# displは排気量, hwyは燃費
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))

m1 <- ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = class))
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, shape = class))
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 7.
## Consider specifying shapes manually if you must have them.
## Warning: Removed 62 rows containing missing values (geom_point).

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = "red"))

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~class)

m2 <- ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = class)) +
  facet_wrap(~class)

m1

m2

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))

ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = hwy, colour = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(aes(colour = class)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar()

ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) +
  geom_bar()

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

a <- ggplot(diamonds, aes(cut, price, fill = cut)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = FALSE) +
  theme_light()

a + coord_flip()

ggsave(a + coord_flip(), filename = "violinplot.jpg")
## Saving 7 x 5 in image

画像の保存方法 GUIでやるかggsave関数

練習

getwd()
## [1] "/Users/juyoutai/Desktop/R/2019_handson_ggplot2"
list.files()
##  [1] "2019_handson_ggplot2.Rproj" "20190907_handson.html"     
##  [3] "20190907_handson.Rmd"       "data2_corrected.csv"       
##  [5] "expression_annt.txt"        "ggplot2_cache"             
##  [7] "ggplot2_files"              "ggplot2_script.R"          
##  [9] "ggplot2.html"               "ggplot2.Rmd"               
## [11] "ggpointdensity.html"        "ggpointdensity.Rmd"        
## [13] "heatmap_2.txt"              "hex_bin_ggseqlogo.Rmd"     
## [15] "iris_2.csv"                 "iris.csv"                  
## [17] "iris.tsv"                   "limma_result.txt"          
## [19] "merged_annot.txt"           "violinplot.jpg"
data <- readr::read_csv("data2_corrected.csv")
## Parsed with column specification:
## cols(
##   `Sample ID` = col_double(),
##   Age = col_double(),
##   Gender = col_character(),
##   Height = col_double(),
##   Weight = col_double(),
##   BMI = col_double(),
##   Food_A = col_double()
## )
head(data)
## # A tibble: 6 x 7
##   `Sample ID`   Age Gender Height Weight   BMI Food_A
##         <dbl> <dbl> <chr>   <dbl>  <dbl> <dbl>  <dbl>
## 1           1    36 Female    162     52  19.8     18
## 2           2    13 Female    160     43  16.8      8
## 3           3    20 Female    153     46  19.7     37
## 4           4    24 Male      167     54  19.4     57
## 5           5    22 Female    153     43  18.4     14
## 6           6    48 Male      168     60  21.3     35
class(data)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"
str(data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 80 obs. of  7 variables:
##  $ Sample ID: num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age      : num  36 13 20 24 22 48 46 49 26 50 ...
##  $ Gender   : chr  "Female" "Female" "Female" "Male" ...
##  $ Height   : num  162 160 153 167 153 168 153 157 159 153 ...
##  $ Weight   : num  52 43 46 54 43 60 44 44 48 42 ...
##  $ BMI      : num  19.8 16.8 19.7 19.4 18.4 21.3 18.8 17.9 19 17.9 ...
##  $ Food_A   : num  18 8 37 57 14 35 88 100 37 7.2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Sample ID` = col_double(),
##   ..   Age = col_double(),
##   ..   Gender = col_character(),
##   ..   Height = col_double(),
##   ..   Weight = col_double(),
##   ..   BMI = col_double(),
##   ..   Food_A = col_double()
##   .. )
summary(data)
##    Sample ID          Age           Gender              Height     
##  Min.   : 1.00   Min.   :10.00   Length:80          Min.   :129.0  
##  1st Qu.:20.75   1st Qu.:25.75   Class :character   1st Qu.:153.0  
##  Median :40.50   Median :37.00   Mode  :character   Median :158.5  
##  Mean   :40.50   Mean   :35.60                      Mean   :158.6  
##  3rd Qu.:60.25   3rd Qu.:46.00                      3rd Qu.:165.2  
##  Max.   :80.00   Max.   :60.00                      Max.   :177.0  
##      Weight           BMI            Food_A      
##  Min.   :30.00   Min.   :16.00   Min.   :  7.20  
##  1st Qu.:44.75   1st Qu.:19.00   1st Qu.: 37.75  
##  Median :50.00   Median :19.80   Median : 59.00  
##  Mean   :52.08   Mean   :20.62   Mean   : 71.27  
##  3rd Qu.:56.25   3rd Qu.:21.60   3rd Qu.: 93.25  
##  Max.   :80.00   Max.   :27.10   Max.   :360.00
summary(data$Food_A)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7.20   37.75   59.00   71.27   93.25  360.00
mean(data$Food_A) # 平均
## [1] 71.265
sd(data$Food_A) # 標準偏差
## [1] 52.98051
var(data$Food_A) # 分散
## [1] 2806.934
class(data$Gender) # Rではデータの型が重要となる. Genderはcharacterなので、これをfactorに変換する必要がある
## [1] "character"
data$Gender <- factor(data$Gender, #factorに変更
          levels = c("Male", "Female"))  # Maleを1, Femaleを2に指定
class(data$Gender)
## [1] "factor"
summary(data)
##    Sample ID          Age           Gender       Height     
##  Min.   : 1.00   Min.   :10.00   Male  :35   Min.   :129.0  
##  1st Qu.:20.75   1st Qu.:25.75   Female:45   1st Qu.:153.0  
##  Median :40.50   Median :37.00               Median :158.5  
##  Mean   :40.50   Mean   :35.60               Mean   :158.6  
##  3rd Qu.:60.25   3rd Qu.:46.00               3rd Qu.:165.2  
##  Max.   :80.00   Max.   :60.00               Max.   :177.0  
##      Weight           BMI            Food_A      
##  Min.   :30.00   Min.   :16.00   Min.   :  7.20  
##  1st Qu.:44.75   1st Qu.:19.00   1st Qu.: 37.75  
##  Median :50.00   Median :19.80   Median : 59.00  
##  Mean   :52.08   Mean   :20.62   Mean   : 71.27  
##  3rd Qu.:56.25   3rd Qu.:21.60   3rd Qu.: 93.25  
##  Max.   :80.00   Max.   :27.10   Max.   :360.00
summary(subset(data, Gender == 'Female')) #女性のみを抽出
##    Sample ID         Age           Gender       Height     
##  Min.   : 1.0   Min.   :12.00   Male  : 0   Min.   :140.0  
##  1st Qu.:17.0   1st Qu.:24.00   Female:45   1st Qu.:150.0  
##  Median :35.0   Median :33.00               Median :155.0  
##  Mean   :38.2   Mean   :33.38               Mean   :154.7  
##  3rd Qu.:58.0   3rd Qu.:45.00               3rd Qu.:159.0  
##  Max.   :79.0   Max.   :53.00               Max.   :172.0  
##      Weight           BMI           Food_A      
##  Min.   :36.00   Min.   :16.0   Min.   :  7.20  
##  1st Qu.:43.00   1st Qu.:18.9   1st Qu.: 28.00  
##  Median :47.00   Median :19.8   Median : 43.00  
##  Mean   :48.29   Mean   :20.2   Mean   : 61.96  
##  3rd Qu.:51.00   3rd Qu.:21.1   3rd Qu.: 71.00  
##  Max.   :66.00   Max.   :26.4   Max.   :360.00
p <- ggplot(data, aes(Age, Food_A)) +  # 解析対象の列を指定
  geom_point()         # 散布図なのでpointで作図することを指定
p

p +  geom_point(aes(color = Gender)) # デフォルト

p +  geom_point(aes(color = Gender)) + 
  scale_colour_manual(values = c(Male  = "black", Female  = "red")) # 色を指定した

p +  geom_point(size = 3, aes(shape = Gender)) # 丸と三角

p + stat_smooth(method = "lm") # 回帰直線を追加

summary(data)
##    Sample ID          Age           Gender       Height     
##  Min.   : 1.00   Min.   :10.00   Male  :35   Min.   :129.0  
##  1st Qu.:20.75   1st Qu.:25.75   Female:45   1st Qu.:153.0  
##  Median :40.50   Median :37.00               Median :158.5  
##  Mean   :40.50   Mean   :35.60               Mean   :158.6  
##  3rd Qu.:60.25   3rd Qu.:46.00               3rd Qu.:165.2  
##  Max.   :80.00   Max.   :60.00               Max.   :177.0  
##      Weight           BMI            Food_A      
##  Min.   :30.00   Min.   :16.00   Min.   :  7.20  
##  1st Qu.:44.75   1st Qu.:19.00   1st Qu.: 37.75  
##  Median :50.00   Median :19.80   Median : 59.00  
##  Mean   :52.08   Mean   :20.62   Mean   : 71.27  
##  3rd Qu.:56.25   3rd Qu.:21.60   3rd Qu.: 93.25  
##  Max.   :80.00   Max.   :27.10   Max.   :360.00
p <- ggplot(data,     # データを指定
            aes(Height, Weight)) +  # x, y軸をそれぞれ指定
  geom_point(aes(color = Gender)) # 性差について色分け
p 

p <- ggplot(data, aes(x = Gender, y = BMI))
p + geom_boxplot() # 箱ひげ図

# 身長の分布
ggplot(data, aes(x = Height)) + geom_density()

ggplot(data, aes(x = Height)) + geom_density(aes(colour = Gender))

ggplot(data, aes(x = Height)) + geom_histogram(fill = "white", colour = "black") +
  facet_grid(Gender ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# お遊び〜Food_Aと体重の関係は?
ggplot(data, aes(x = Weight, y = Food_A)) + geom_point()

# 正規性が無いのでSpearmanの相関係数
ggplot(data, aes(x = Weight)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data, aes(x = Food_A)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cor.test(data$Food_A, #xを指定
         data$Weight,    #yを指定
         method="spearman") #Spearman's correlationを指定
## Warning in cor.test.default(data$Food_A, data$Weight, method = "spearman"):
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  data$Food_A and data$Weight
## S = 62755, p-value = 0.01776
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2644733
# p値と相関係数は?
# ここまで

ggplot(data, aes(x = Food_A)) + geom_histogram(fill = "white", colour = "black") +
  facet_grid(Gender ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

viridis colour

参照 視覚多様性の時代

# viridis theme
data <- data.frame(x = rnorm(10000), y = rnorm(10000))
# viridis
g1 <- ggplot(data, aes(x = x, y = y)) + geom_hex() + coord_fixed() +
  scale_fill_viridis_c() +
  theme_bw()
# magma
g2 <- ggplot(data, aes(x, y)) + geom_hex() + coord_fixed() +
  scale_fill_viridis_c(option = "magma") +
  theme_bw()
# inferno
g3 <- ggplot(data, aes(x, y)) + geom_hex() + coord_fixed() +
  scale_fill_viridis_c(option = "inferno") +
  theme_bw()
# plasma
g4 <- ggplot(data, aes(x, y)) + geom_hex() + coord_fixed() +
  scale_fill_viridis_c(option = "plasma") +
  theme_bw()
# cividis
g5 <- ggplot(data, aes(x, y)) + geom_hex() + coord_fixed() +
  scale_fill_viridis_c(option = "cividis") +
  theme_bw()
# default colour of ggplot2
g6 <- ggplot(data, aes(x, y)) + geom_hex() + coord_fixed() + theme_bw()
gridExtra::grid.arrange(g1, g2, g3, g4, g5, g6)

gplotsパッケージのheatmap.2関数を使ってヒートマップ作成

heatmap_2 <-  read.table("heatmap_2.txt", header = T, sep = "\t", row.names = 1)
head(heatmap_2)
##         GSM995221_AD01M001.CEL.gz GSM995222_AD01M002.CEL.gz
## Il1r2                    9.709961                  9.499286
## Il1rl1                  10.869457                 10.595727
## Il18rap                  8.661484                  8.596499
## Icos                    11.556232                 11.625647
## Fam129a                 10.466855                 10.324924
## Rgs16                   10.649960                 10.630798
##         GSM995223_AD01M003.CEL.gz GSM995224_AD01M004.CEL.gz
## Il1r2                    8.653889                  8.677315
## Il1rl1                   9.412765                  9.279133
## Il18rap                  7.772801                  7.761333
## Icos                    11.010891                 10.862727
## Fam129a                  9.845083                  9.640511
## Rgs16                   10.320202                 10.154570
##         GSM995225_AD01M005.CEL.gz GSM995226_AD01M006.CEL.gz
## Il1r2                    6.766613                  7.282546
## Il1rl1                   6.696681                  6.555741
## Il18rap                  6.975710                  6.711794
## Icos                     8.990778                  9.695695
## Fam129a                  8.449838                  9.240955
## Rgs16                    9.282057                  9.599733
##         GSM995227_AD01M007.CEL.gz GSM995228_AD01M008.CEL.gz
## Il1r2                    7.030207                  7.114095
## Il1rl1                   6.561950                  6.557071
## Il18rap                  6.987141                  6.770077
## Icos                     9.515691                  9.177716
## Fam129a                  8.827236                  8.570377
## Rgs16                    9.579403                  9.278177
colnames(heatmap_2) <- c("1", "2", "3", "4", "5", "6", "7", "8")
head(heatmap_2)
##                 1         2         3         4        5        6        7
## Il1r2    9.709961  9.499286  8.653889  8.677315 6.766613 7.282546 7.030207
## Il1rl1  10.869457 10.595727  9.412765  9.279133 6.696681 6.555741 6.561950
## Il18rap  8.661484  8.596499  7.772801  7.761333 6.975710 6.711794 6.987141
## Icos    11.556232 11.625647 11.010891 10.862727 8.990778 9.695695 9.515691
## Fam129a 10.466855 10.324924  9.845083  9.640511 8.449838 9.240955 8.827236
## Rgs16   10.649960 10.630798 10.320202 10.154570 9.282057 9.599733 9.579403
##                8
## Il1r2   7.114095
## Il1rl1  6.557071
## Il18rap 6.770077
## Icos    9.177716
## Fam129a 8.570377
## Rgs16   9.278177
class(heatmap_2)
## [1] "data.frame"
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(viridis)
## Loading required package: viridisLite
# png("heatmap.png", width = 400, height = 800)
heatmap.2(as.matrix(heatmap_2), col = bluered(256), trace = "none", cexCol = 0.9, cexRow = 0.5, labRow = F)

heatmap.2(as.matrix(heatmap_2), col = bluered(256), trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row", labRow = F) # scaling

# Seurat colour
heatmap.2(as.matrix(heatmap_2), col = colorRampPalette(c("#FF00FF", "#000000", "#FFFF00")), trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row", labRow = F)

heatmap.2(as.matrix(heatmap_2), col = viridis, trace = "none", cexCol = 0.9, cexRow = 0.5, labRow = F)

heatmap.2(as.matrix(heatmap_2), col = viridis, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row", labRow = F)

heatmap.2(as.matrix(heatmap_2), col = viridis, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "col", labRow = F)

heatmap.2(as.matrix(heatmap_2), col = magma, trace = "none", cexCol = 0.9, cexRow = 0.5, labRow = F)

# scaling
heatmap.2(as.matrix(heatmap_2), col = magma, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row", labRow = F)

heatmap.2(as.matrix(heatmap_2), col = plasma, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row", labRow = F)

heatmap.2(as.matrix(heatmap_2), col = inferno, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row", labRow = F)

heatmap.2(as.matrix(heatmap_2), col = cividis, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row", labRow = F)

heatmap.2(as.matrix(heatmap_2), col = viridis, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row", labRow = F)

同じような作業を繰り返すなら簡単なプログラミングで効率的に この程度ならfor構文で構わない map関数でもできると思う(勉強不足)

for (i in c(viridis, magma, plasma, inferno, cividis)) {
  heatmap.2(as.matrix(heatmap_2), col = i, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row")
}

complexheatmapを用いると複雑なヒートマップ描画できるが今回は割愛

地図かける geom_sf()

動画も作れる gganimate

バインフォ的なプロット

MAプロットとボルケーノプロット マイクロアレイやRNA-seqの発現変動遺伝子を示す際によく用いられる

### 少しはバイオインフォマティクス的なプロットも
# MA-plotやvolcano plotを作図してみる
merged <- readr::read_delim("merged_annot.txt", delim = " ")
## Parsed with column specification:
## cols(
##   PROBEID = col_character(),
##   logFC = col_double(),
##   AveExpr = col_double(),
##   t = col_double(),
##   P.Value = col_double(),
##   adj.P.Val = col_double(),
##   B = col_double(),
##   SYMBOL = col_character(),
##   GENENAME = col_character()
## )
str(merged)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 15297 obs. of  9 variables:
##  $ PROBEID  : chr  "ILMN_1763207" "ILMN_1739001" "ILMN_2384056" "ILMN_2393765" ...
##  $ logFC    : num  -5.87 -5.57 -5.48 -5.45 -5.45 ...
##  $ AveExpr  : num  8.83 7.61 8.66 12.69 7.34 ...
##  $ t        : num  -6.64 -5.96 -5.3 -5.89 -10.46 ...
##  $ P.Value  : num  2.17e-04 4.36e-04 8.98e-04 4.71e-04 9.79e-06 ...
##  $ adj.P.Val: num  0.0631 0.0845 0.1083 0.0888 0.0231 ...
##  $ B        : num  1.155 0.506 -0.182 0.435 3.774 ...
##  $ SYMBOL   : chr  "BATF3" "TACSTD2" "GPER1" "IGLL1" ...
##  $ GENENAME : chr  "basic leucine zipper ATF-like transcription factor 3" "tumor associated calcium signal transducer 2" "G protein-coupled estrogen receptor 1" "immunoglobulin lambda like polypeptide 1" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   PROBEID = col_character(),
##   ..   logFC = col_double(),
##   ..   AveExpr = col_double(),
##   ..   t = col_double(),
##   ..   P.Value = col_double(),
##   ..   adj.P.Val = col_double(),
##   ..   B = col_double(),
##   ..   SYMBOL = col_character(),
##   ..   GENENAME = col_character()
##   .. )
head(merged)
## # A tibble: 6 x 9
##   PROBEID  logFC AveExpr      t P.Value adj.P.Val      B SYMBOL GENENAME   
##   <chr>    <dbl>   <dbl>  <dbl>   <dbl>     <dbl>  <dbl> <chr>  <chr>      
## 1 ILMN_17… -5.87    8.83  -6.64 2.17e-4    0.0631  1.16  BATF3  basic leuc…
## 2 ILMN_17… -5.57    7.61  -5.96 4.36e-4    0.0845  0.506 TACST… tumor asso…
## 3 ILMN_23… -5.48    8.66  -5.30 8.98e-4    0.108  -0.182 GPER1  G protein-…
## 4 ILMN_23… -5.45   12.7   -5.89 4.71e-4    0.0888  0.435 IGLL1  immunoglob…
## 5 ILMN_16… -5.45    7.34 -10.5  9.79e-6    0.0231  3.77  IDO1   indoleamin…
## 6 ILMN_16… -5.29    7.53  -8.54 4.03e-5    0.0332  2.64  CDC20  cell divis…
DE_sig <- dplyr::filter(merged, adj.P.Val <= 0.05, SYMBOL != "NA")
# FDRが0.05以下の遺伝子のみ抜き出す
ma <- ggplot(data = merged, aes(x = AveExpr, y = logFC)) + 
  geom_point(size = 0.1) +
  geom_point(data = DE_sig, colour = "red", size = 0.3) + 
  theme_classic()
ma + ggtitle("MA plot")

library(ggrepel)
ma + geom_text_repel(data = DE_sig, aes(label = SYMBOL))

vp <- ggplot(data = merged, aes(x = logFC, y = -log10(adj.P.Val))) + 
  geom_point(size = 0.1) +
  geom_point(data = DE_sig, colour = "red", size = 0.3) + 
  theme_classic()
vp

vp + ggtitle("volcano plot of microarray")

vp + geom_text_repel(data = DE_sig, aes(label = SYMBOL))

ggseqlogoパッケージ使うとggplot2でsequence logoを描画できる。ggplot2の要領で図に色々と書き込める点も便利!時代はtidyverse!

library(MotifDb)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unsplit, which,
##     which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:gplots':
## 
##     space
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.6.1
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## See system.file("LICENSE", package="MotifDb") for use restrictions.
library(seqLogo)
## Loading required package: grid
library(ggseqlogo)
# motif visualisation

PotentialIRF8 <- as.list(subset (MotifDb,
                                tolower (geneSymbol)=="irf8"))
# seqlogo
PotentialIRF8[[2]]
##           1          2          3          4          5          6
## A 0.1621622 0.45945946 0.13513514 0.54054054 0.97297297 0.91891892
## C 0.1621622 0.10810811 0.05405405 0.02702703 0.00000000 0.02702703
## G 0.5135135 0.35135135 0.75675676 0.43243243 0.02702703 0.02702703
## T 0.1621622 0.08108108 0.05405405 0.00000000 0.00000000 0.02702703
##           7         8          9         10         11         12
## A 0.2702703 0.1081081 0.00000000 0.94594595 0.89189189 0.86486486
## C 0.3783784 0.1081081 0.05405405 0.00000000 0.05405405 0.00000000
## G 0.3513514 0.1081081 0.91891892 0.05405405 0.05405405 0.08108108
## T 0.0000000 0.6756757 0.02702703 0.00000000 0.00000000 0.05405405
##          13         14        15
## A 0.1621622 0.05405405 0.3513514
## C 0.3513514 0.35135135 0.1351351
## G 0.3513514 0.10810811 0.4054054
## T 0.1351351 0.48648649 0.1081081
seqLogo(PotentialIRF8[[2]]) # mouse

seqLogo(PotentialIRF8[[4]]) # human

# ggseqlogo
ggseqlogo(PotentialIRF8[[2]])

ggseqlogo(PotentialIRF8[[4]])

p1 <- ggseqlogo(PotentialIRF8[[2]], method = "bits")
p2 <- ggseqlogo(PotentialIRF8[[2]], method = "prob")
gridExtra::grid.arrange(p1, p2)

p3 <- ggplot() + geom_logo(PotentialIRF8[[2]]) + theme_classic() + 
  scale_y_continuous(expand = c(0,0), limits = c(0, 2.0)) + 
  scale_x_continuous(expand = c(0,0), breaks = seq(1,15,1))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
gridExtra::grid.arrange(p1, p3)

# add annotation
ggplot() + 
  annotate('rect', xmin = 2.5, xmax = 6.5, ymin = -0.05, ymax = 1.9, alpha = .1, col='black', fill='yellow') +
  annotate('rect', xmin = 8.5, xmax = 12.5, ymin = -0.05, ymax = 1.9, alpha = .1, col='black', fill='yellow') +
  geom_logo(PotentialIRF8[[2]], stack_width = 0.90) + 
  annotate('text', x=4.5, y=2, label='IRF or ETS') + 
  annotate('text', x=10.5, y=2, label='IRF motif') + 
  theme_logo()

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggrepel_0.8.1     viridis_0.5.1     viridisLite_0.3.0 gplots_3.0.1.1   
## [5] ggplot2_3.2.1    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.2         plyr_1.8.4         pillar_1.4.2      
##  [4] compiler_3.6.0     bitops_1.0-6       tools_3.6.0       
##  [7] zeallot_0.1.0      digest_0.6.20      evaluate_0.14     
## [10] tibble_2.1.3       gtable_0.3.0       pkgconfig_2.0.2   
## [13] rlang_0.4.0        cli_1.1.0          yaml_2.2.0        
## [16] xfun_0.9           gridExtra_2.3      withr_2.1.2       
## [19] dplyr_0.8.3        stringr_1.4.0      knitr_1.24        
## [22] caTools_1.17.1.2   gtools_3.8.1       hms_0.5.1         
## [25] vctrs_0.2.0        grid_3.6.0         tidyselect_0.2.5  
## [28] glue_1.3.1         R6_2.4.0           fansi_0.4.0       
## [31] rmarkdown_1.15     gdata_2.18.0       reshape2_1.4.3    
## [34] readr_1.3.1        purrr_0.3.2        magrittr_1.5      
## [37] scales_1.0.0       backports_1.1.4    htmltools_0.3.6   
## [40] assertthat_0.2.1   colorspace_1.4-1   labeling_0.3      
## [43] KernSmooth_2.23-15 utf8_1.1.4         stringi_1.4.3     
## [46] lazyeval_0.2.2     munsell_0.5.0      crayon_1.3.4
date()
## [1] "Sat Sep  7 10:33:42 2019"